suppressPackageStartupMessages(library(tidyverse))
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tidyr' was built under R version 4.2.3
## Warning: package 'readr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'stringr' was built under R version 4.2.3
devtools::load_all('~/Google Drive/My Drive/Scripts/R_packages/myUtilities/')
## ℹ Loading myUtilities
Settings
# Input/output locations. `wd` is the analysis root; the table and FASTA
# output folders are built by appending to it (it ends with a slash).
wd <- "~/Google Drive/My Drive/Analysis/METTL2A/"
data_dir <- '/Volumes/Mitsu_NGS_3/METTL2A/'
fastadir <- paste0(wd, 'Fasta/DRS/Kmer_range/')
tabledir <- paste0(wd, 'Tables/DRS_m3C_sites/mRNAs/')
#figdir <- paste0(wd, 'Figures/DRS_m3C_sites/')

# Make the analysis root the working directory for the rest of the session.
setwd(wd)

# Session-wide ggplot2 default: classic theme, 7 pt base size, legend below.
theme_set(theme_classic(base_size = 7) + theme(legend.position = 'bottom'))
Functions
#' Prefix a path with the analysis root directory
#'
#' @param path Character vector of paths relative to the root.
#' @param root Directory prefix, expected to end with a path separator because
#'   the two parts are concatenated without inserting one. Defaults to the
#'   script-level `wd`, looked up when the function is called, so existing
#'   one-argument calls behave exactly as before.
#' @return Character vector: `root` pasted in front of each element of `path`.
paste_wd <- function(path, root = wd) {
  paste0(root, path)
}
Read data
RNA sequence of m3C RNAs
# Transcript table (gzipped TSV) read from under the analysis root; the
# rendered column specification lists transcript_id, transcript_seq and
# transcript_length.
espresso_AsPC1_transcriptome_seqs <- read_tsv(
  file = paste_wd(
    'Tables/Database/espresso_AsPC1_transcriptome_seqs_2024-04-22.tsv.gz'
  )
)
## Rows: 36717 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): transcript_id, transcript_seq
## dbl (1): transcript_length
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Auto-print the transcript table to check that it was read as expected.
espresso_AsPC1_transcriptome_seqs
## # A tibble: 36,717 × 3
## transcript_id transcript_seq transcript_length
## <chr> <chr> <dbl>
## 1 ENST00000339437.11 AGCCCGGAAGTGCGCGTGGCGGCGGTGGCGGCTGCGGCA… 987
## 2 ENST00000251607.11 AGCCCGGAAGTGCGCGTGGCGGCGGTGGCGGCTGCGGCA… 2252
## 3 ENST00000420393.5 CAGCGGGGCCGGTAAGCGGGCGCGCGCCGCTCAGAGGGG… 854
## 4 ENST00000698415.1 GATGTATGATGAGTTTAGTTGAATGCTCGTGTTGCTGTC… 6597
## 5 ENST00000698416.1 CATGACTAGTTTTGTGGGTAGCAATGATGTTTAAATGTC… 5500
## 6 ENST00000488263.5 AGGAACTTCATCATGAAGTCTCAAGTAAACGAACATTTT… 4528
## 7 ENST00000424814.5 GAGATCAGCAGGACGCTGCGCACAACATGGGCAACCACC… 2038
## 8 ENST00000231948.9 AGACATGGCCGGCGAAGGAGATCAGCAGGACGCTGCGCA… 2187
## 9 ENST00000432408.6 GCCTCCTTTGCGGGTAAACAGACATGGCCGGCGAAGGAG… 2203
## 10 ENST00000459840.5 ATGGAGGCATTTAAACTGGGACTGAGATGGGACTGAGTG… 723
## # ℹ 36,707 more rows
m3C positions
# Read the m3C site table and attach the sequence and length of each site's
# transcript from `espresso_AsPC1_transcriptome_seqs`.
DRS_methylated_positions_CDSpos <-
  read_tsv(
    paste_wd(
      'Tables/DRS_m3C_sites/Metagene_CDS/DRS_methylated_positions_CDSpos_2024-06-05.tsv'
    )
  ) |>
  left_join(
    espresso_AsPC1_transcriptome_seqs,
    # Name the key explicitly instead of relying on a natural join, so a
    # column that both tables happen to share later cannot silently become
    # part of the join key.
    by = join_by(transcript_id),
    # Several sites may sit on one transcript, but a site must never match
    # more than one sequence row; fail loudly rather than duplicating sites.
    relationship = "many-to-one"
  )
## Rows: 436 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (7): transcript_id, gene_name, seqname, gene_type, ref_kmer, genetype2,...
## dbl (11): kmer_start, kmer_end, kmer_middle, length, rel_kmer_start, rel_kme...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Joining with `by = join_by(transcript_id)`
# Auto-print the joined site table to check the added sequence columns.
DRS_methylated_positions_CDSpos
## # A tibble: 436 × 20
## transcript_id gene_name seqname gene_type ref_kmer kmer_start kmer_end
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 ENST00000429711.7 RPL32 chr3 protein_cod… GCCCA 423 427
## 2 ENST00000647248.2 RPL35A chr3 protein_cod… ACCCC 381 385
## 3 ENST00000647248.2 RPL35A chr3 protein_cod… CCCCT 382 386
## 4 ENST00000361390.2 MT-ND1 chrM protein_cod… ACCCA 33 37
## 5 ENST00000361390.2 MT-ND1 chrM protein_cod… CCCCT 123 127
## 6 ENST00000361390.2 MT-ND1 chrM protein_cod… ACCCT 141 145
## 7 ENST00000361390.2 MT-ND1 chrM protein_cod… ACCCG 186 190
## 8 ENST00000361390.2 MT-ND1 chrM protein_cod… ACCCT 205 209
## 9 ENST00000361390.2 MT-ND1 chrM protein_cod… CCCCC 260 264
## 10 ENST00000361390.2 MT-ND1 chrM protein_cod… ACCTC 322 326
## # ℹ 426 more rows
## # ℹ 13 more variables: kmer_middle <dbl>, genetype2 <chr>, length <dbl>,
## # rel_kmer_start <dbl>, rel_kmer_middle <dbl>, rel_kmer_end <dbl>,
## # start <dbl>, end <dbl>, thickStart <dbl>, thickEnd <dbl>,
## # kmer_region <chr>, transcript_seq <chr>, transcript_length <dbl>
Get neighboring sequences of the m3C sites (mRNAs and mt-mRNAs)
# For every m3C site, cut the k-mer plus up to 5 flanking nucleotides on each
# side out of the transcript sequence, keep the identifying columns, and build
# a '|'-separated record name for the FASTA export.
DRS_methylated_positions_CDSpos_neiborseq <-
  DRS_methylated_positions_CDSpos |>
  mutate(
    # str_sub() counts a negative `start` from the END of the string, so a
    # k-mer starting within 5 nt of the transcript 5' end would otherwise give
    # an empty or wrong window. Clamp the window start to the first base.
    # An `end` past the last base is already truncated by str_sub(), and
    # pmax() keeps NA positions as NA.
    neibor_seq = str_sub(
      transcript_seq,
      start = pmax(kmer_start - 5, 1),
      end = kmer_end + 5
    )
  ) |>
  select(
    transcript_id, kmer_middle, ref_kmer, neibor_seq, genetype2, kmer_region
  ) |>
  mutate(
    name = paste(transcript_id, kmer_middle, genetype2, kmer_region, sep = '|')
  )

# Save the table as a gzipped TSV in `tabledir`.
# NOTE(review): export_tsv() comes from myUtilities and appears to name the
# file after the piped object - keep the object piped in by name; confirm.
DRS_methylated_positions_CDSpos_neiborseq |>
  export_tsv(outdir = tabledir, compression = 'gz')
##
## Exported to: ~/Google Drive/My Drive/Analysis/METTL2A/Tables/DRS_m3C_sites/mRNAs/DRS_methylated_positions_CDSpos_neiborseq_2024-06-10.tsv.gz
## # A tibble: 436 × 7
## transcript_id kmer_middle ref_kmer neibor_seq genetype2 kmer_region name
## <chr> <dbl> <chr> <chr> <chr> <chr> <chr>
## 1 ENST00000429711.7 425 GCCCA GAGCTGCCC… mRNA CDS ENST…
## 2 ENST00000647248.2 383 ACCCC GCTGTACCC… mRNA CDS ENST…
## 3 ENST00000647248.2 384 CCCCT CTGTACCCC… mRNA CDS ENST…
## 4 ENST00000361390.2 35 ACCCA ATTGTACCC… mt-mRNA CDS ENST…
## 5 ENST00000361390.2 125 CCCCT GTAGGCCCC… mt-mRNA CDS ENST…
## 6 ENST00000361390.2 143 ACCCT CTACAACCC… mt-mRNA CDS ENST…
## 7 ENST00000361390.2 188 ACCCG CTAAAACCC… mt-mRNA CDS ENST…
## 8 ENST00000361390.2 207 ACCCT CCATCACCC… mt-mRNA CDS ENST…
## 9 ENST00000361390.2 262 CCCCC ATGAACCCC… mt-mRNA CDS ENST…
## 10 ENST00000361390.2 324 ACCTC TAGCCACCT… mt-mRNA CDS ENST…
## # ℹ 426 more rows
Export neighboring sequences as FASTA
mRNA
# Write one FASTA file per transcript region for the sites whose genetype2 is
# 'mRNA'. The mRNA filter does not depend on the region, so it is applied once
# up front, and the regions to loop over are taken from that subset: a region
# that occurs only in non-mRNA rows (or a missing region) would otherwise hand
# a zero-row table to export_as_fasta().
DRS_methylated_positions_mRNAs_neiborseq <-
  DRS_methylated_positions_CDSpos_neiborseq |>
  filter(genetype2 == 'mRNA', !is.na(kmer_region))

for (region in unique(DRS_methylated_positions_mRNAs_neiborseq$kmer_region)) {
  print(region)
  # The region name is the suffix of the output file's base name.
  fasta_basename <- paste0('DRS_methylated_positions_mRNAs_neiborseq_', region)
  filtered_df <-
    DRS_methylated_positions_mRNAs_neiborseq |>
    filter(kmer_region == region)
  print(filtered_df)
  # `name` supplies the record headers and `neibor_seq` the sequences.
  filtered_df |>
    export_as_fasta(
      name = name, sequence = neibor_seq,
      fasta_basename = fasta_basename, outdir = fastadir
    )
}
## [1] "CDS"
## # A tibble: 179 × 7
## transcript_id kmer_middle ref_kmer neibor_seq genetype2 kmer_region name
## <chr> <dbl> <chr> <chr> <chr> <chr> <chr>
## 1 ENST00000429711.7 425 GCCCA GAGCTGCCC… mRNA CDS ENST…
## 2 ENST00000647248.2 383 ACCCC GCTGTACCC… mRNA CDS ENST…
## 3 ENST00000647248.2 384 CCCCT CTGTACCCC… mRNA CDS ENST…
## 4 ENST00000215754.8 182 GCCAC CGCAGGCCA… mRNA CDS ENST…
## 5 ENST00000215754.8 193 GCCCC GGCAAGCCC… mRNA CDS ENST…
## 6 ENST00000270625.7 163 ACCCA AAGACACCC… mRNA CDS ENST…
## 7 ENST00000270625.7 475 CACCA GCCGGCACC… mRNA CDS ENST…
## 8 ENST00000331825.… 347 GCCAC CGTGAGCCA… mRNA CDS ENST…
## 9 ENST00000331825.… 395 GTCTC CGAGCGTCT… mRNA CDS ENST…
## 10 ENST00000331825.… 487 GCCAT CAGACGCCA… mRNA CDS ENST…
## # ℹ 169 more rows
##
## Exported to: ~/Google Drive/My Drive/Analysis/METTL2A/Fasta/DRS/Kmer_range/DRS_methylated_positions_mRNAs_neiborseq_CDS.fasta
##
## [1] "fiveprimeUTR"
## # A tibble: 30 × 7
## transcript_id kmer_middle ref_kmer neibor_seq genetype2 kmer_region name
## <chr> <dbl> <chr> <chr> <chr> <chr> <chr>
## 1 ENST00000215754.8 81 GTCCT TCCTGGTCC… mRNA fiveprimeU… ENST…
## 2 ENST00000331825.… 23 GTCTG CGCGGGTCT… mRNA fiveprimeU… ENST…
## 3 ENST00000331825.… 124 ACCAT CCGGGACCA… mRNA fiveprimeU… ENST…
## 4 ENST00000331825.… 195 ACCAA TGCCAACCA… mRNA fiveprimeU… ENST…
## 5 ENST00000501597.3 60 GCCAT TTAGCGCCA… mRNA fiveprimeU… ENST…
## 6 ENST00000501597.3 83 GCCAT TCTGCGCCA… mRNA fiveprimeU… ENST…
## 7 ENST00000392514.9 61 TCCCT GGCAATCCC… mRNA fiveprimeU… ENST…
## 8 ENST00000321153.9 46 CTCCG GACTTCTCC… mRNA fiveprimeU… ENST…
## 9 ENST00000273550.… 62 ACCCG ACGGAACCC… mRNA fiveprimeU… ENST…
## 10 ENST00000273550.… 104 GCCCT AGCCAGCCC… mRNA fiveprimeU… ENST…
## # ℹ 20 more rows
##
## Exported to: ~/Google Drive/My Drive/Analysis/METTL2A/Fasta/DRS/Kmer_range/DRS_methylated_positions_mRNAs_neiborseq_fiveprimeUTR.fasta
##
## [1] "threeprimeUTR"
## # A tibble: 45 × 7
## transcript_id kmer_middle ref_kmer neibor_seq genetype2 kmer_region name
## <chr> <dbl> <chr> <chr> <chr> <chr> <chr>
## 1 ENST00000215754.8 486 ACCCG CGGGAACCC… mRNA threeprime… ENST…
## 2 ENST00000199764.7 1700 TTCAG TCCTTTTCA… mRNA threeprime… ENST…
## 3 ENST00000552551.5 1981 ACCCA AGGAGACCC… mRNA threeprime… ENST…
## 4 ENST00000552551.5 2010 GCCCA CCTCAGCCC… mRNA threeprime… ENST…
## 5 ENST00000501597.3 260 GTCTA CTACTGTCT… mRNA threeprime… ENST…
## 6 ENST00000501597.3 281 ATCTA AATGGATCT… mRNA threeprime… ENST…
## 7 ENST00000501597.3 296 GCCCT TCATCGCCC… mRNA threeprime… ENST…
## 8 ENST00000501597.3 314 ACCTC CGATCACCT… mRNA threeprime… ENST…
## 9 ENST00000501597.3 323 ACCCA CTGAGACCC… mRNA threeprime… ENST…
## 10 ENST00000501597.3 371 ACCTG CCTGGACCT… mRNA threeprime… ENST…
## # ℹ 35 more rows
##
## Exported to: ~/Google Drive/My Drive/Analysis/METTL2A/Fasta/DRS/Kmer_range/DRS_methylated_positions_mRNAs_neiborseq_threeprimeUTR.fasta